import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
# Widen the display so every column is visible during inspection.
# FIX: the canonical option name is 'display.max_columns' (plural); the
# original 'display.max_column' only worked through pandas prefix matching.
pd.set_option('display.max_columns', 100)

# Load the training data, keeping the default RangeIndex.
df = pd.read_csv('train.csv', index_col=None)
df.head()
df.info()

# Per-column boolean: does the missing-value fraction exceed 40%?
round(df.isnull().sum()/len(df.index), 2)>0.4
There are five columns with more than 40% missing values in them. Looking at the column descriptions, they can also be removed because other variables present in the data serve the same business purpose (for example, FireplaceQu is largely redundant with the Fireplaces count).
# Columns dominated by missing values; the information they carry is
# already captured by other columns, so they are dropped outright.
high_missing_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
df = df.drop(high_missing_cols, axis=1)

# How many rows are missing in *every* column?
df.isnull().all(axis=1).sum()

# Fraction of rows carrying more than 5 / 7 / 10 missing values.
print(len(df[df.isnull().sum(axis=1) > 5].index) / len(df))
print(len(df[df.isnull().sum(axis=1) > 7].index) / len(df))
print(len(df[df.isnull().sum(axis=1) > 10].index) / len(df))
# LotFrontage still has a sizeable share of missing values — drop it.
df = df.drop(['LotFrontage'], axis=1)

# Drop every row with more than 5 missing values; the share of rows lost
# is small (see the fractions printed above).
row_missing = df.isnull().sum(axis=1)
df = df[row_missing <= 5]
print(df.shape)
df.info()

# Fraction of remaining rows with more than 2 / 3 / 4 missing values.
for threshold in (2, 3, 4):
    print(len(df[df.isnull().sum(axis=1) > threshold].index) / len(df))

# Absolute count of rows still carrying more than 2 missing values.
len(df[df.isnull().sum(axis=1) > 2].index)
# Partition column names by dtype: object columns are treated as
# categorical, everything else as numerical.
numerical = [col for col in df.columns if df[col].dtype != 'O']
categorical = [col for col in df.columns if df[col].dtype == 'O']

# Remaining missing-value counts for each group.
df[numerical].isnull().sum()
df[categorical].isnull().sum()

pd.set_option('display.max_rows', 150)

# Inspect the garage- and basement-related columns of the rows that still
# have more than 2 missing values.
sparse_rows = df[df.isnull().sum(axis=1) > 2]
sparse_rows[['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']]
sparse_rows[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']]
This means that either these houses have no garage or basement, or the gaps are data-recording errors. Since "no garage" is already represented by its own category in these columns, the gaps are most likely recording errors, so it is better to remove these rows.
# The remaining sparse rows look like recording errors — keep only rows
# with at most 2 missing values.
df = df[df.isnull().sum(axis=1) <= 2]
print(df.shape)
df.info()

# Fill the remaining numeric gaps in MasVnrArea with the column mean.
masvnr_missing = df['MasVnrArea'].isnull()
df.loc[masvnr_missing, 'MasVnrArea'] = df['MasVnrArea'].mean()
# Impute categorical gaps with each column's most frequent value.
# Cast to 'category' first and print the frequency tables that justify
# the choice of fill value.
for col in ('MasVnrType', 'BsmtExposure', 'Electrical', 'BsmtFinType2'):
    df[col] = df[col].astype('category')
    print(df[col].value_counts())

# Modal value per column, as observed in the tables above.
modal_fill = {
    'MasVnrType': 'None',
    'BsmtExposure': 'No',
    'Electrical': 'SBrkr',
    'BsmtFinType2': 'Unf',
}
for col, value in modal_fill.items():
    df[col] = df[col].fillna(value)

# Confirm which columns (if any) still contain missing values.
df.isnull().sum() > 0
df.info()
# Re-derive the dtype-based column split after all cleaning steps.
numerical = [var for var in df.columns if df[var].dtype != 'O']
categorical = [var for var in df.columns if df[var].dtype == 'O']
print(len(numerical))
print(len(categorical))

# Pairwise scatter plots for the numerical variables, split into two
# batches so each grid stays readable.
numeric1 = numerical[:20]
numeric2 = numerical[20:]
# FIX: sns.pairplot always creates its own figure, so the original
# plt.figure(figsize=(20, 10)) calls only produced empty extra figures;
# size is controlled via pairplot's height/aspect arguments instead.
sns.pairplot(df[numeric1])
plt.show()
sns.pairplot(df[numeric2])
plt.show()

# Numeric columns with at most 5 distinct values behave like categories.
num_cat = [i for i in numerical if len(df[i].unique()) <= 5]
num_cat
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(22, 20))
# FIX: numeric_only=True — pandas >= 2.0 raises on object columns in corr().
sns.heatmap(df.corr(numeric_only=True), cmap="YlGnBu", annot=True)
plt.show()

df.describe()

# Distribution of the target variable.
# FIX: sns.distplot is deprecated (removed in seaborn 0.14);
# histplot(..., kde=True) is the modern equivalent.
sns.histplot(df['SalePrice'], kde=True)
plt.show()
from datetime import datetime

# Derive each house's age relative to the current year — age is easier to
# interpret (and to scale) than a raw construction year.
current_year = datetime.now().year
df['house_age'] = current_year - df['YearBuilt']
df['house_age']

sns.distplot(df['house_age'])
plt.show()

# The raw year columns are now redundant with house_age.
df = df.drop(['YearBuilt', 'YearRemodAdd'], axis=1)
numerical.remove('YearBuilt')
numerical.remove('YearRemodAdd')
df.shape
# Treat the low-cardinality numeric columns (plus BsmtFinType2) as
# categorical, and remove them — along with Id — from the numeric list.
num_cat.append('BsmtFinType2')
categorical.extend(num_cat)
numerical = [col for col in numerical if col not in num_cat]
print(len(categorical))
print(len(numerical))
numerical.remove('Id')
# Rescale every numeric feature into the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[numerical] = scaler.fit_transform(df[numerical])
df.head()
# Count-like columns already behave as ordinal numbers — keep them out of
# the one-hot encoding.
count_like = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
              'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'YrSold']
categorical = [col for col in categorical if col not in count_like]

dfcategorical = df[categorical]
dfcategorical.head()

# One-hot encode (drop_first avoids the dummy-variable trap) ...
dummies = pd.get_dummies(dfcategorical, drop_first=True)
dummies.head()

# ... then swap the raw categorical columns for their dummies.
df = df.drop(categorical, axis=1)
df = pd.concat([df, dummies], axis=1)
df.head()
# Separate the feature matrix from the target, then hold out 20% of the
# data for final evaluation.
y = df['SalePrice']
X = df.drop(['Id', 'SalePrice'], axis=1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=100)
# Regularisation strengths to tune over (shared by Ridge and Lasso).
params = {
    'alpha': [0.00005, 0.0001, 0.0002, 0.0003, 0.0005, 0.0007, 0.0009,
              0.001, 0.005, 0.007, 0.01, 0.05, 0.07, 0.1, 0.2, 0.3,
              0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

# 5-fold cross-validated grid search for Ridge, scored by R^2.
folds = 5
ridge = Ridge()
model_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='r2',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)

# Keep only the configurations that fit the training data well.
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results = cv_results[cv_results['mean_train_score'] >= 0.85]
cv_results.head()
# Plot mean train/test R^2 against alpha.
# BUG FIX: the original cast param_alpha to 'int32', which truncates every
# alpha below 1.0 to 0 and collapses the x-axis; cast to float instead.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('R^2 Error')
plt.title("R^2 Vs alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

# Refit Ridge at the chosen alpha and inspect the coefficients.
alpha = 0.0001
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)
ridge.coef_
# Cross-validated grid search for Lasso over the same alpha grid.
lasso = Lasso()
model_cv = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring='r2',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv.fit(X_train, y_train)
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()
# Plot mean train/test R^2 against alpha on a log-scaled x-axis, since the
# alpha grid spans several orders of magnitude.
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')
alphas = cv_results['param_alpha']
for score_col in ('mean_train_score', 'mean_test_score'):
    plt.plot(alphas, cv_results[score_col])
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('R^2 Error')
plt.title("R^2 Vs alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

# Refit Lasso at the chosen alpha and inspect the coefficients.
alpha = 0.0001
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)
lasso.coef_
from sklearn.metrics import r2_score

# Held-out R^2 for both tuned models.
print('Ridge Test R2 Score', r2_score(y_test, ridge.predict(X_test)))
print('Lasso Test R2 Score', r2_score(y_test, lasso.predict(X_test)))
from sklearn.feature_selection import RFE

# Recursive feature elimination down to the 10 strongest predictors,
# once per fitted model.
# FIX: n_features_to_select must be passed by keyword — the second
# positional argument of RFE was removed in scikit-learn 0.24.
rfe = RFE(ridge, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)
col1 = list(X_train.columns[rfe.support_])
col1

rfe = RFE(lasso, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)
col2 = list(X_train.columns[rfe.support_])
col2

# Features selected by BOTH Ridge and Lasso.
finalcol = [i for i in col1 if i in col2]
finalcol
# Correlation heatmaps of each selected feature set against SalePrice.
col1.append('SalePrice')
col2.append('SalePrice')
for selected in (col1, col2):
    plt.figure(figsize=(12, 8))
    sns.heatmap(df[selected].corr(), cmap="YlGnBu", annot=True)
    plt.show()

# Augment the consensus list with a handful of additional area features.
finalcol.extend(['GrLivArea', 'GarageArea', '1stFlrSF', 'LotArea', 'MasVnrArea'])
finalcol

# Restrict the train/test matrices to the final feature set.
xtrain_new = X_train[finalcol]
xtest_new = X_test[finalcol]
# Re-tune Lasso on the reduced feature set.
lasso = Lasso()
model_cv = GridSearchCV(estimator=lasso, param_grid=params, scoring='r2',
                        cv=folds, return_train_score=True, verbose=1)
model_cv.fit(xtrain_new, y_train)

# BUG FIX: the original plotted the PREVIOUS search's cv_results — the
# frame was never refreshed after this fit, so the chart showed stale
# results. Rebuild it from the new search before plotting.
cv_results = pd.DataFrame(model_cv.cv_results_)

# Plot mean train/test R^2 against alpha (log-scaled x-axis).
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float32')
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.title("R^2 Vs alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()